#functions

multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  require(grid)

  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                 ncol = cols, nrow = ceiling(numPlots/cols))
}

if (numPlots == 1) {
print(plots[[1]])

} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

for (i in 1:numPlots) {
  matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

  print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                  layout.pos.col = matchidx$col))
}
}
}

plotDist <- function(df, attribute, df_med){
  g <- ggplot(data = df) #, aes(x = eval(parse(text = attribute, n = NA))))
  g <- g + geom_histogram(aes(x = eval(parse(text = attribute, n = NA)),
                              #y=..count..,
                              y=..count../sum(..count..), 
                              #y=..density.., 
                              fill=as.factor(df$tissue)),
                        #colour=as.factor(df$tissue),
                        color = "black",
                        alpha = 0.4, binwidth = 0.1,
                        size = 0.25, 
                        position="identity", 
                        na.rm = T)
  g <- g + scale_fill_manual(NULL, 
                    labels = c("normal" = "Normal",
                               "cancer" = "Tumor"), 
                    values = c("normal" = "#008000",
                               "cancer" = "black"))
  g <- g + scale_color_manual(NULL, 
                    labels = c("normal" = "Normal",
                               "cancer" = "Tumor"), 
                    values = c("normal" = "#008000",
                               "cancer" = "black"))
  g <- g + geom_vline(xintercept = median(as.numeric(unlist(df[tissue == "normal", c(a), with = F])), na.rm = T),
                      linetype="solid", 
                      color = "#008000", size=1)
  g <- g + geom_vline(xintercept = median(as.numeric(unlist(df[tissue == "cancer", c(a), with = F])), na.rm = T),
                      linetype="solid", 
                      color = "black", size=1)
  g <- g + geom_vline(xintercept = df_med[CpGmarker == attribute]$medianByCpG, linetype="dashed", 
                      color = "#F5BB4E", size=1)
  #g <- g + scale_color_discrete(name = NULL, labels = c("Tumor Dist. Median", "Normal Dist. Median",
  #                                                      "Horvath CpGs Median"))
  g <- g + xlim(0.0,1.0)
  g <- g + ggtitle(paste0(attribute), subtitle = paste0("Chr: ", df_med[CpGmarker == attribute]$Chr," Gene: ", df_med[CpGmarker == attribute]$Symbol))
  # g <- g + geom_density(aes(x = eval(parse(text = attribute, n = NA))), #cg01485645),
  #                       color = "red", alpha = 0.3, size = 0.75,
  #                       #binwidth = 0.01,
  #                       stat = "density", #scaled = T,
  #                       #position="identity",
  #                       na.rm = T)
  g <- g + theme_classic(base_family = "palatino", base_size = 11)
  g <- g + ylab("Counts") + xlab("") #xlab(paste0(attribute))
  g <- g + theme(#axis.text.x = element_blank(),
                 #axis.ticks.x = element_blank(),
                 legend.position = "bottom")
  # g <- g + annotate("text", x = 0.,
  #                   y=temp_df[ERCC2_def_fact == "ERCC2 mutant", 
  #                             eval(parse(text = featureName, n = NA))] + 0.05, 
  #                   label = "*", size=5, color = '#008000')
  #print(wilcox.test(as.numeric(unlist(df[tissue == "normal", c(a), with = F])), as.numeric(unlist(df[tissue == "cancer", c(a), with = F])))$p.value)
  fig <- annotate_figure(g,
                  #top = text_grob("Visualizing Tooth Growth", color = "red", face = "bold", size = 14),
                  bottom = text_grob(paste0("Wilcoxon test p-value = ",round(wilcox.test(as.numeric(unlist(df[tissue == "normal", c(a), with = F])), as.numeric(unlist(df[tissue == "cancer", c(a), with = F])))$p.value, 3)), color = "black", hjust = 0.0, x = 0.1, #face = "italic", 
                                     size = 10)
                  #left = text_grob("Figure arranged using ggpubr", color = "green", rot = 90),
                  #right = "I'm done, thanks :-)!",
                  #fig.lab = "Figure 1", 
                  #fig.lab.face = "bold"
                  )
  return(fig)
}

## function to plot PCs 

plotPCA <- function(first, second, Ljustification, Lposition,
                    ellipse = T, tempdf,  colors, labelDT = NULL){
  g <- ggplot(data = tempdf)
  if(!is.null(colors) & colors != F){
    g <- g + geom_point(data = tempdf[colors_text == "black"], aes(x = eval(parse(text=first)), 
                                                                   y = eval(parse(text=second)), 
                                                                   fill = eval(parse(text=colors))), size = 3.5, 
                        alpha = 0.6, shape = 21)
    g <- g + geom_point(data = tempdf[colors_text != "black"], aes(x = eval(parse(text=first)), 
                                                                   y = eval(parse(text=second)), 
                                                                   fill = eval(parse(text=colors))), size = 3.5, 
                        alpha = 0.8, shape = 21)
    } else {
      g <- g + geom_point(aes(x = eval(parse(text=first)), y = eval(parse(text=second))), fill = "black", size = 2.5, 
                          alpha = 0.6, shape = 21)
      }
#  if(!is.null(labelDT)){
#    g <- g + geom_label_repel(data = data_plot, aes(x = eval(parse(text=first)), y = eval(parse(text=second)), 
#                                                  label = PatientID), color = colors_text, 
#                              family = "palatino",size=3,
#                              box.padding = unit(0.5, "lines"),
#                              point.padding = unit(0.5, "lines"),
#                              segment.size = 0.2, alpha = 0.8)
#  }
  if(ellipse == T){
    g <- g + stat_ellipse(aes(x=eval(parse(text=first)), y=eval(parse(text=second)), fill=Gene, color = Gene),
                          type = "t", geom="polygon", alpha = 0.2)
    }
  if(!is.null(colors) & colors != F){
    g <- g + scale_fill_manual(NULL, labels=c("black"="Tumor",
                                              "#008000"="Normal"),
                               values = c("black"="black",
                                          "#008000"="#008000"))
    #g <- g + scale_color_manual("BRCA-genotype", values = c("Wild-type"="#A9A9A9", 
    #                                                  "BRCA1 wild-tpye with LOH"="green", 
    #                                                   "BRCA2 wild-type with LOH"="blue",
    #                                                   "BRCA1 and BRCA2 wild-type with LOH"="red"), guide = F)
    }
  g <- g + scale_x_continuous(first) + scale_y_continuous(second)
  g <- g + theme_classic(base_family = "palatino", base_size = 12)
  g <- g + theme(legend.position = Lposition, legend.justification = Ljustification, 
                 legend.background = element_rect(size = 0.1, color = "black", fill = alpha("white",0.6)))
  return(g)
  #print(g)
  #dev.off()
}

1 Horvath DNAm Age

# #Normalization step
# 
# # Comment regarding the following normalization method based on BMIQ.R# The original BMIQ function from Teschendorff 2013(Bioinformatics. 2013 Jan 15;29(2):189-96) # adjusts for the type-2 bias in Illumina Infinium 450k data.# Later functions and edits were provided by yours truly, Steve Horvath.# I changed the code so that one can calibrate methylation data to a gold standard.# Specifically, I took version v_1.2 by Teschendorff  and fixed minor issues. # Also I made the code more robust e.g. by changing the optimization algorithm.
# 
# # Toward this end, I used the method="Nelder-Mead" in optim()
# source("NORMALIZATION.R")
# 
# #Comment: The file NORMALIZATION.R.txtcontains R function which will only be invoked in Step 3 below.
# 
# #Age transformation and probe annotation functions
# 
# trafo=function(x,adult.age=20) { x=(x+1)/(1+adult.age); y=ifelse(x<=1, log( x),x-1);y }
# 
# anti.trafo=function(x,adult.age=20) { ifelse(x<0, (1+adult.age)*exp(x)-1, (1+adult.age)*x+adult.age) }
# 
# probeAnnotation21kdatMethUsed=read.csv("probeAnnotation21kdatMethUsed.csv")
# probeAnnotation27k=read.csv("datMiniAnnotation27k.csv")
# datClock=read.csv("AdditionalFile3.csv")
# 
# #Read in the DNA methylation data (beta values)
# 
# # For a small file, e.g. measured on the 27k platform you could just use read.csv. 
# # But for large files, e.g. those measured on the 450K platform, I recommend you use read.csv.sql.
# dat0=read.csv.sql("sote_colorectal_methyl_27loci.csv");
# head(dat0)
# dat0 <- as.data.table(dat0)
# dat0 <- dat0[, c("TargetID", "X5652NAT", "X5683NAT", "X5646NAT", "X5663NAT", "X5690NAT",
#          "Tr1NAT", "Tr2NAT", "Tr3NAT", "Tr4NAT", "Tr5NAT", "Tr6NAT", "Tr7NAT")]
# dat0 <- as.data.frame(dat0)
# nSamples=dim(dat0)[[2]]-1
# nProbes= dim(dat0)[[1]]
# 
# # the following command may not be needed. But it is sometimes useful when you use read.csv.sql
# dat0[,1]= gsub(x=dat0 [,1],pattern="\"",replacement="")
# 
# #Create a log file which will be output into your directory
# # The code looks a bit complicated because it serves to create a log file (for error checks etc).
# # It will automatically create a log file.
# file.remove("LogFile.txt")
# file.create("LogFile.txt")
# #DoNotProceed=FALSE
# # cat(paste( "The methylation data set contains", nSamples, "samples (e.g. arrays) and ", nProbes, " probes."),file="LogFile.txt")
# # if (nSamples==0) {DoNotProceed=TRUE; cat(paste( "\n ERROR: There must be a data input error since there seem to be no samples.\n Make sure that you input a comma delimited file (.csv file)\n that can be read using the R command read.csv.sql . Samples correspond to columns in that file  ."), file="LogFile.txt",append=TRUE) }
# # if (nProbes==0) {DoNotProceed=TRUE; cat(paste( "\n ERROR: There must be a data input error since there seem to be zero probes.\n Make sure that you input a comma delimited file (.csv file)\n that can be read using the R command read.csv.sql  CpGs correspond to rows.")   , file="LogFile.txt",append=TRUE) }
# # if (  nSamples > nProbes  ) { cat(paste( "\n MAJOR WARNING: It worries me a lot that there are more samples than CpG probes.\n Make sure that probes correspond to rows and samples to columns.\n I wonder whether you want to first transpose the data and then resubmit them? In any event, I will proceed with the analysis."),file="LogFile.txt",append=TRUE) }
# # if (  is.numeric(dat0[,1]) ) { DoNotProceed=TRUE; cat(paste( "\n Error: The first column does not seem to contain probe identifiers (cg numbers from Illumina) since these entries are numeric values. Make sure that the first column of the file contains probe identifiers such as cg00000292. Instead it contains ", dat0[1:3,1]  ),file="LogFile.txt",append=TRUE)  }
# # if (  !is.character(dat0[,1]) ) {  cat(paste( "\n Major Warning: The first column does not seem to contain probe identifiers (cg numbers from Illumina) since these entries are numeric values. Make sure that the first column of the file contains CpG probe identifiers such as cg00000292. Instead it contains ", dat0[1:3,1]  ),file="LogFile.txt",append=TRUE)  }
# # datout=data.frame(Error=c("Input error. Please check the log file for details","Please read the instructions carefully."), Comment=c("", "email Steve Horvath."))
# 
# #if ( ! DoNotProceed ) {
#   nonNumericColumn=rep(FALSE, dim(dat0)[[2]]-1)
#   for (i in 2:dim(dat0)[[2]] ){ nonNumericColumn[i-1]=! is.numeric(dat0[,i]) }
#   if (  sum(nonNumericColumn) >0 ) { cat(paste( "\n MAJOR WARNING: Possible input error. The following samples contain non-numeric beta values: ", colnames(dat0)[-1][ nonNumericColumn], "\n Hint: Maybe you use the wrong symbols for missing data. Make sure to code missing values as NA in the Excel file. To proceed, I will force the entries into numeric values but make sure this makes sense.\n" ),file="LogFile.txt",append=TRUE)  } 
#   XchromosomalCpGs=as.character(probeAnnotation27k$Name[probeAnnotation27k$Chr=="X"])
#   selectXchromosome=is.element(dat0[,1], XchromosomalCpGs )
#   selectXchromosome[is.na(selectXchromosome)]=FALSE
#   meanXchromosome=rep(NA, dim(dat0)[[2]]-1)
#   if (   sum(selectXchromosome) >=500 )  {
#     meanXchromosome= as.numeric(apply( as.matrix(dat0[selectXchromosome,-1]),2,mean,na.rm=TRUE)) }
#   if (  sum(is.na(meanXchromosome)) >0 ) { cat(paste( "\n \n Comment: There are lots of missing values for X chromosomal probes for some of the samples. This is not a problem when it comes to estimating age but I cannot predict the gender of these samples.\n " ),file="LogFile.txt",append=TRUE)  } 
#   
#   match1=match(probeAnnotation21kdatMethUsed$Name , dat0[,1])
#   if  ( sum( is.na(match1))>0 ) { 
#     missingProbes= probeAnnotation21kdatMethUsed$Name[!is.element( probeAnnotation21kdatMethUsed$Name , dat0[,1])]    
#     DoNotProceed=TRUE; cat(paste( "\n \n Input error: You forgot to include the following ", length(missingProbes), " CpG probes (or probe names):\n ", paste( missingProbes, sep="",collapse=", ")),file="LogFile.txt",append=TRUE)  } 
#   
#   #STEP 2: Restrict the data to 21k probes and ensure they are numeric
#   match1=match(probeAnnotation21kdatMethUsed$Name , dat0[,1])
#   if  ( sum( is.na(match1))>0 ) stop(paste(sum( is.na(match1)), "CpG probes cannot be matched"))
#   dat1= dat0[match1,]
#   asnumeric1=function(x) {as.numeric(as.character(x))}
#   dat1[,-1]=apply(as.matrix(dat1[,-1]),2,asnumeric1)
#   
#   #STEP 3: Create the output file called datout
#   set.seed(1)
#   # Do you want to normalize the data (recommended)?
#   normalizeData=TRUE
#   source("StepwiseAnalysis.txt")
#   
#   # STEP 4: Output the results 
#   if (  sum(  datout$Comment  != "" ) ==0 ) { cat(paste( "\n The individual samples appear to be fine. "),file="LogFile.txt",append=TRUE)  } 
#   if (  sum(  datout$Comment != "" ) >0 ) { cat(paste( "\n Warnings were generated for the following samples.\n", datout[,1][datout$Comment != ""], "\n Hint: Check the output file for more details."),file="LogFile.txt",append=TRUE)  } 
# #} 
# 
# # output the results into the directory
# write.table(datout,"sote_colorectal_output_normal.csv", row.names=F, sep="," )

2 The distribution of the Horvath clock CpGs

From the 353 Horvath CpGs 334 is present on the 850k methylation chip (19 missing). The distribution of methylation values at a given CpG site is presented below. Normal sample distribution is colored with green and a green vertical line indicates the median of the distribution. Tumor sample distribution is colored with balck and a black vertical line indicates the median of the distribution. The yellow vertical line marks the median value of the distribution of the samples used by S. Horvath in his training set. The p-values calculated by the Wilcoxon test show whether the median of the distribution of normal and tumor samples in a given CpG site significantly different from each other.

3 PCA

PCA was performed using the normal colorectal samples. Normal and tumor samples were standardized separately and tumor samples were projected into the 3D space defined by the first three PCs of normal samples.

3.1 Variance explained by the PCs

X_COAD <- as.matrix(methyldf_n)
X_COAD <- scale(X_COAD)
resSVD_COAD <- svd(X_COAD)

percentage <- c(0,resSVD_COAD$d^2/sum(resSVD_COAD$d^2))

tempdt_COAD <- data.table(percent = percentage)
tempdt_COAD[, cum_percent := cumsum(percent)]

#options(repr.plot.width=10, repr.plot.height=5)

g <- ggplot(tempdt_COAD)
#g <- g + geom_hline(aes(yintercept=1), linetype = 1, size = 0.5)
#g <- g + annotate("rect", xmin=0, xmax=7, ymin=0, ymax=Inf, alpha=0.2, fill="black")
g <- g + geom_vline(aes(xintercept = which(tempdt_COAD$cum_percent >= 0.9)[1]-1), 
                    color = "#222222", linetype = "twodash", size = 0.2)
g <- g + geom_hline(data = tempdt_COAD[which(tempdt_COAD$cum_percent >= 0.9)[1],], 
                    aes(yintercept = cum_percent), color = "#222222", linetype = "twodash", size = 0.2)
g <- g + geom_line(aes(x = 0:(nrow(tempdt_COAD)-1), y = cum_percent, linetype = "cumsum"),
                   color = "#CC2200", size = 0.5)
g <- g + geom_point(aes(x = 0:(nrow(tempdt_COAD)-1), y = cum_percent), 
                    color = "black", size = 1.5)
#g <- g + geom_line(aes(x = 1:30, y = percent), size = 1, color = "red")
g <- g + geom_bar(data = tempdt_COAD[percentage > 0], aes(x = 1:(nrow(tempdt_COAD)-1), 
                                                          y = percent, color = "points"), size = 0.1, stat = "identity", alpha = 0.8, fill = "black")
g <- g + scale_linetype_manual(NULL, values=c("cumsum" = 1), labels=c("cumsum"="Cumulative Curve"))
g <- g + scale_color_manual(NULL, values=c("points" = "black"), labels=c("points"="Explained Variance by PC"))
g <- g + scale_x_continuous("Principal components", breaks=c(seq(0,nrow(methyldf_n),3)))
g <- g + scale_y_continuous("Explained Variance", breaks = c(seq(0,1,0.15),1))
g <- g + ggtitle("Variance Explained by the PCs - Normal Samples")
g <- g + theme_classic(base_family = "palatino", base_size = 12)
g <- g + theme(legend.position = c(0.98,0.1),
               legend.justification = c(1,0),
               legend.spacing.y = unit(-0.3, units = "cm"),
               legend.background = element_rect(fill=NA),
               panel.grid.minor = element_blank())
print(g)

As it is shown above, 9 principal components explain more than 90% of the variance in the data.

3.2 Contribution of features to the PCs

Variables that are correlated with PC1 and PC2 are the most important in explaining the variability in the data set. Variables that does not correlate with any PC or correlate with the last dimensions are variables with low contribution and might be removed to simplify the overall analysis.

The contribution of variables accounting for the variability in a given principal component is (in percentage):

\[\begin{equation} \frac{\cos^2(X_i) \cdot 100}{\sum_{i=1} ^{n} \cos^2(X_i)}. \end{equation}\]

If the contribution of the variables were uniform, the expected value in percentage would be \(\frac{1}{334} \cdot 100 = 0.3\%\), as it is indicated on the graph with the red dashed line. For a given component, a variable with a contribution larger than this cutoff could be considered as important in contribution to the component.

3.3 Projections onto the space of the first 3 PCs

# plot 3D scatter plot 
  
#methyldf_n <- scale(methyldf_n)
pca_test <- prcomp(methyldf_n, #retx=TRUE, center=TRUE, 
                   scale=T)
methyldf_c <- scale(methyldf_c)
pred <- predict(pca_test, newdata=methyldf_c)

## normal
temp_list <- c("5652NAT",  "5683NAT",  "5646NAT",  "5663NAT", "5690NAT", "Tr1NAT", "Tr2NAT", "Tr3NAT", "Tr4NAT", "Tr5NAT", "Tr6NAT", "Tr7NAT")
df <- data.frame(matrix(unlist(temp_list), nrow=length(temp_list), byrow=T))
colnames(df) <- c("PatientID")
df$PatientID <- as.character(df$PatientID)
#df
data_plot = as.data.frame(pcs[, 1:3])
colnames(data_plot) <- c("PC1", "PC2", "PC3")
data_plot <- cbind(data_plot, methylData_n_t[, c("tissue")])
data_plot <- cbind(data_plot, df)
data_plot$colors_text <- as.character('#008000')

## tumor
temp_list <- c("5653AD","5684AD","5648CRC","5662CRC","5689CRC","Tr1CRC","Tr2CRC","Tr3CRC","Tr4CRC","Tr5CRC","Tr6CRC","Tr7CRC")
df_c <- data.frame(matrix(unlist(temp_list), nrow=length(temp_list), byrow=T))
colnames(df_c) <- c("PatientID")
df_c$PatientID <- as.character(df_c$PatientID)
#df_c
data_plot_c = as.data.frame(pred[, 1:3])
colnames(data_plot_c) <- c("PC1", "PC2", "PC3")
data_plot_c <- cbind(data_plot_c, methylData_c_t[, c("tissue")])
data_plot_c <- cbind(data_plot_c, df_c)
data_plot_c$colors_text <- as.character('black')

data_plot <- rbind(data_plot, data_plot_c)
#table(data_plot$colors_text)
#head(data_plot)

data_plot$tissue[which(data_plot$tissue == 'normal')] <- 'Normal'
data_plot$tissue[which(data_plot$tissue == 'cancer')] <- 'Tumor'

data_plot$tissue <- as.factor(data_plot$tissue)
#dim(data_plot)
#head(data_plot)

p <- plot_ly(data = data_plot, x = ~PC1, y = ~PC2, z = ~PC3, 
             color = ~tissue, 
             size = 10,
             colors = c('#008000', 'black'), #'#A9A9A9', '#008000', '#C85A80'),
             text = ~PatientID) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')))

#chart_link = api_create(p, filename="scatter3d-basic")
#chart_link
p

4 DNAm age and real age

4.1 Normal samples

4.1.1 Correlation between real age and PCs

age_pcs <- merge(df_n[, -c("colors_text")], data_plot[tissue == "Normal"], by.x = "SampleID", by.y = "PatientID")

oldw <- getOption("warn")
options(warn = -1)

b <- ggplot(age_pcs, aes(x = realAge, y = PC1)) #, fill = Gender))
b <- b + stat_regline_equation(aes(x = realAge, y = PC1,
                                      label =  paste(..rr.label..)), 
                               label.x = 50, label.y = 35)
b <- b + stat_cor(method = "pearson", aes(x = realAge, y = PC1), 
                     label.x = 50, label.y = 30)
b <- b + geom_point(aes(x = realAge, y = PC1, fill = Gender),
                    color = age_pcs$Gender, 
                    size = 4, shape = 23)
#b <- b + geom_smooth(method = lm, se = FALSE)
#b <- b + xlim(30, 100) + ylim(30, 100) 
#b <- b + geom_abline(intercept = 0, slope = 1)
b <- b + theme_classic(base_size = 12, base_family = "palatino")
b <- b + ggtitle("Correlation - Real age and PC1", subtitle = "Normal samples")
b <- b + theme(legend.position = "bottom")

#print(b)

b2 <- ggplot(age_pcs, aes(x = realAge, y = PC2)) #, fill = Gender))
b2 <- b2 + stat_regline_equation(aes(x = realAge, y = PC2,
                                      label =  paste(..rr.label..)), 
                               label.x = 50, label.y = 35)
b2 <- b2 + stat_cor(method = "pearson", aes(x = realAge, y = PC2), 
                     label.x = 50, label.y = 30)
b2 <- b2 + geom_point(aes(x = realAge, y = PC2, fill = Gender),
                    color = age_pcs$Gender, 
                    size = 4, shape = 23)
#b2 <- b2 + geom_smooth(method = lm, se = FALSE)
#b2 <- b2 + xlim(30, 100) + ylim(30, 100) 
#b2 <- b2 + geom_abline(intercept = 0, slope = 1)
b2 <- b2 + theme_classic(base_size = 12, base_family = "palatino")
b2 <- b2 + ggtitle("Correlation - Real age and PC2", subtitle = "Normal samples")
b2 <- b2 + theme(legend.position = "bottom")

#print(b2)

b3 <- ggplot(age_pcs, aes(x = realAge, y = PC3)) #, fill = Gender))
b3 <- b3 + stat_regline_equation(aes(x = realAge, y = PC3,
                                      label =  paste(..rr.label..)), 
                               label.x = 50, label.y = 35)
b3 <- b3 + stat_cor(method = "pearson", aes(x = realAge, y = PC3), 
                     label.x = 50, label.y = 30)
b3 <- b3 + geom_point(aes(x = realAge, y = PC3, fill = Gender),
                    color = age_pcs$Gender, 
                    size = 4, shape = 23)
#b3 <- b3 + geom_smooth(method = lm, se = FALSE)
#b3 <- b3 + xlim(30, 100) + ylim(30, 100) 
#b3 <- b3 + geom_abline(intercept = 0, slope = 1)
b3 <- b3 + theme_classic(base_size = 12, base_family = "palatino")
b3 <- b3 + ggtitle("Correlation - Real age and PC3", subtitle = "Normal samples")
b3 <- b3 + theme(legend.position = "bottom")

#print(b3)

options(warn = oldw)

grid.arrange(b, b2, b3, ncol = 3, nrow = 1)

4.2 Tumor samples

4.2.1 Correlation between real age and PCs

age_pcs <- merge(df_c[, -c("colors_text")], data_plot[tissue == "Tumor"], by.x = "SampleID", by.y = "PatientID")

oldw <- getOption("warn")
options(warn = -1)

b <- ggplot(age_pcs, aes(x = realAge, y = PC1)) #, fill = Gender))
b <- b + stat_regline_equation(aes(x = realAge, y = PC1,
                                      label =  paste(..rr.label..)), 
                               label.x = 50, label.y = 150)
b <- b + stat_cor(method = "pearson", aes(x = realAge, y = PC1), 
                     label.x = 50, label.y = 100)
b <- b + geom_point(aes(x = realAge, y = PC1, fill = Gender),
                    color = age_pcs$Gender, 
                    size = 4, shape = 23)
#b <- b + geom_smooth(method = lm, se = FALSE)
#b <- b + xlim(30, 100) + ylim(30, 100) 
#b <- b + geom_abline(intercept = 0, slope = 1)
b <- b + theme_classic(base_size = 12, base_family = "palatino")
b <- b + ggtitle("Correlation - Real age and PC1", subtitle = "Tumor samples")
b <- b + theme(legend.position = "bottom")

#print(b)

b2 <- ggplot(age_pcs, aes(x = realAge, y = PC2)) #, fill = Gender))
b2 <- b2 + stat_regline_equation(aes(x = realAge, y = PC2,
                                      label =  paste(..rr.label..)), 
                               label.x = 50, label.y = 0)
b2 <- b2 + stat_cor(method = "pearson", aes(x = realAge, y = PC2), 
                     label.x = 50, label.y = -50)
b2 <- b2 + geom_point(aes(x = realAge, y = PC2, fill = Gender),
                    color = age_pcs$Gender, 
                    size = 4, shape = 23)
#b2 <- b2 + geom_smooth(method = lm, se = FALSE)
#b2 <- b2 + xlim(30, 100) + ylim(30, 100) 
#b2 <- b2 + geom_abline(intercept = 0, slope = 1)
b2 <- b2 + theme_classic(base_size = 12, base_family = "palatino")
b2 <- b2 + ggtitle("Correlation - Real age and PC2", subtitle = "Tumor samples")
b2 <- b2 + theme(legend.position = "bottom")

#print(b2)

b3 <- ggplot(age_pcs, aes(x = realAge, y = PC3)) #, fill = Gender))
b3 <- b3 + stat_regline_equation(aes(x = realAge, y = PC3,
                                      label =  paste(..rr.label..)), 
                               label.x = 50, label.y = 120)
b3 <- b3 + stat_cor(method = "pearson", aes(x = realAge, y = PC3), 
                     label.x = 50, label.y = 100)
b3 <- b3 + geom_point(aes(x = realAge, y = PC3, fill = Gender),
                    color = age_pcs$Gender, 
                    size = 4, shape = 23)
#b3 <- b3 + geom_smooth(method = lm, se = FALSE)
#b3 <- b3 + xlim(30, 100) + ylim(30, 100) 
#b3 <- b3 + geom_abline(intercept = 0, slope = 1)
b3 <- b3 + theme_classic(base_size = 12, base_family = "palatino")
b3 <- b3 + ggtitle("Correlation - Real age and PC3", subtitle = "Tumor samples")
b3 <- b3 + theme(legend.position = "bottom")

#print(b3)

options(warn = oldw)

grid.arrange(b, b2, b3, ncol = 3, nrow = 1)